In [3]:
import pandas as pd

# Load the CSV file with semicolon as delimiter
data = pd.read_csv("Customer_Behavior_Data.csv", delimiter=";")

# Display the first few rows to check the structure
print(data.head())
   account length  location code  user id credit card info save push status  \
0             128            415  3824657                    no         yes   
1             107            415  3717191                    no         yes   
2             137            415  3581921                    no          no   
3              84            408  3759999                   yes          no   
4              75            415  3306626                   yes          no   

   add to wishlist  desktop sessions  app sessions  desktop transactions  \
0               25               265            45                    17   
1               26               162            27                    17   
2                0               243            41                    10   
3                0               299            51                     5   
4                0               167            28                    13   

   total product detail views  session duration  promotion clicks  \
0                         110               197                87   
1                         123               196               103   
2                         114               121               110   
3                          71                62                88   
4                         113               148               122   

  avg order value  sale product views discount rate per visited products  \
0           244,7                  91                              11,01   
1           254,4                 103                              11,45   
2           162,6                 104                               7,32   
3           196,9                  89                               8,86   
4           186,9                 121                               8,41   

  product detail view per app session  app transactions  \
0                                  10                 3   
1                                13,7                 3   
2                                12,2                 5   
3                                 6,6                 7   
4                                10,1                 3   

  add to cart per session  customer service calls  churn  
0                     2,7                       1      0  
1                     3,7                       1      0  
2                    3,29                       0      0  
3                    1,78                       2      0  
4                    2,73                       3      0  
In [4]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [5]:
data
Out[5]:
account length location code user id credit card info save push status add to wishlist desktop sessions app sessions desktop transactions total product detail views session duration promotion clicks avg order value sale product views discount rate per visited products product detail view per app session app transactions add to cart per session customer service calls churn
0 128 415 3824657 no yes 25 265 45 17 110 197 87 244,7 91 11,01 10 3 2,7 1 0
1 107 415 3717191 no yes 26 162 27 17 123 196 103 254,4 103 11,45 13,7 3 3,7 1 0
2 137 415 3581921 no no 0 243 41 10 114 121 110 162,6 104 7,32 12,2 5 3,29 0 0
3 84 408 3759999 yes no 0 299 51 5 71 62 88 196,9 89 8,86 6,6 7 1,78 2 0
4 75 415 3306626 yes no 0 167 28 13 113 148 122 186,9 121 8,41 10,1 3 2,73 3 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3328 192 415 4144276 no yes 36 156 27 18 77 216 126 279,1 83 12,56 9,9 6 2,67 2 0
3329 68 415 3703271 no no 0 231 39 13 57 153 55 191,3 123 8,61 9,6 4 2,59 3 0
3330 28 510 3288230 no no 0 181 31 25 109 289 58 191,9 91 8,64 14,1 6 3,81 2 0
3331 184 510 3646381 yes no 0 214 36 14 105 160 84 139,2 137 6,26 5 10 1,35 2 0
3332 74 415 4004344 no yes 25 234 40 23 113 266 82 241,4 77 10,86 13,7 4 3,7 0 0

3333 rows × 20 columns

In [6]:
# Get data types and check for missing values
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   account length                       3333 non-null   int64 
 1   location code                        3333 non-null   int64 
 2   user id                              3333 non-null   int64 
 3   credit card info save                3333 non-null   object
 4   push status                          3333 non-null   object
 5   add to wishlist                      3333 non-null   int64 
 6   desktop sessions                     3333 non-null   int64 
 7   app sessions                         3333 non-null   int64 
 8   desktop transactions                 3333 non-null   int64 
 9   total product detail views           3333 non-null   int64 
 10  session duration                     3333 non-null   int64 
 11  promotion clicks                     3333 non-null   int64 
 12  avg order value                      3333 non-null   object
 13  sale product views                   3333 non-null   int64 
 14  discount rate per visited products   3333 non-null   object
 15  product detail view per app session  3333 non-null   object
 16  app transactions                     3333 non-null   int64 
 17  add to cart per session              3333 non-null   object
 18  customer service calls               3333 non-null   int64 
 19  churn                                3333 non-null   int64 
dtypes: int64(14), object(6)
memory usage: 520.9+ KB
In [7]:
# Check for missing values
data.isnull().sum()
Out[7]:
0
account length 0
location code 0
user id 0
credit card info save 0
push status 0
add to wishlist 0
desktop sessions 0
app sessions 0
desktop transactions 0
total product detail views 0
session duration 0
promotion clicks 0
avg order value 0
sale product views 0
discount rate per visited products 0
product detail view per app session 0
app transactions 0
add to cart per session 0
customer service calls 0
churn 0

In [8]:
# Basic statistics
data.describe().T
Out[8]:
count mean std min 25% 50% 75% max
account length 3333.0 1.010648e+02 39.822106 1.0 74.0 101.0 127.0 243.0
location code 3333.0 4.371824e+02 42.371290 408.0 408.0 415.0 510.0 510.0
user id 3333.0 3.746291e+06 274662.573752 3271058.0 3508680.0 3748187.0 3985970.0 4229964.0
add to wishlist 3333.0 8.099010e+00 13.688365 0.0 0.0 0.0 20.0 51.0
desktop sessions 3333.0 1.798119e+02 54.457135 0.0 144.0 179.0 216.0 351.0
app sessions 3333.0 3.056796e+01 9.269376 0.0 24.0 31.0 37.0 60.0
desktop transactions 3333.0 1.708761e+01 4.323795 0.0 14.0 17.0 20.0 31.0
total product detail views 3333.0 1.004356e+02 20.069084 0.0 87.0 101.0 114.0 165.0
session duration 3333.0 2.010396e+02 50.714359 0.0 167.0 201.0 235.0 364.0
promotion clicks 3333.0 1.001107e+02 19.923911 0.0 87.0 100.0 114.0 170.0
sale product views 3333.0 1.001077e+02 19.568609 33.0 87.0 100.0 113.0 175.0
app transactions 3333.0 4.479448e+00 2.461214 0.0 3.0 4.0 6.0 20.0
customer service calls 3333.0 1.562856e+00 1.315491 0.0 1.0 1.0 2.0 9.0
churn 3333.0 1.449145e-01 0.352067 0.0 0.0 0.0 0.0 1.0
In [9]:
# Shape of the data
print("Data Shape:", data.shape)

# Unique values per column
unique_values = data.nunique()
unique_values
Data Shape: (3333, 20)
Out[9]:
0
account length 212
location code 3
user id 3333
credit card info save 2
push status 2
add to wishlist 46
desktop sessions 295
app sessions 60
desktop transactions 30
total product detail views 119
session duration 287
promotion clicks 123
avg order value 1591
sale product views 120
discount rate per visited products 933
product detail view per app session 162
app transactions 21
add to cart per session 162
customer service calls 10
churn 2

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(data['account length'], kde=True)
plt.title('Distribution of Account Length')
plt.xlabel('Account Length (days)')
plt.show()
No description has been provided for this image
In [11]:
plt.figure(figsize=(8, 6))
sns.histplot(data['desktop sessions'], color='blue', label='Desktop Sessions', kde=True)
sns.histplot(data['app sessions'], color='orange', label='App Sessions', kde=True)
plt.title('Desktop vs. App Sessions')
plt.xlabel('Number of Sessions')
plt.legend()
plt.show()
No description has been provided for this image
In [12]:
plt.figure(figsize=(8, 6))
sns.histplot(data['avg order value'], kde=True, color='green')
plt.title('Distribution of Average Order Value')
plt.xlabel('Average Order Value')
plt.show()
No description has been provided for this image
In [13]:
# Count plot for churn status
plt.figure(figsize=(8, 6))
sns.countplot(x='churn', data=data, palette='viridis')
plt.title('Churn Status')
plt.xlabel('Churn')
plt.ylabel('Number of Customers')
plt.show()
<ipython-input-13-1e512c34adce>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='churn', data=data, palette='viridis')
No description has been provided for this image
In [14]:
# Count plots for each categorical variable
import matplotlib.pyplot as plt
import seaborn as sns

categorical_cols = ['credit card info save', 'push status']  # Replace with actual categorical column names

for col in categorical_cols:
    plt.figure(figsize=(8, 5))
    sns.countplot(data=data, x=col, palette="viridis")
    plt.title(f"Count of {col}")
    plt.show()
<ipython-input-14-c5a6e8b1d039>:9: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=data, x=col, palette="viridis")
No description has been provided for this image
<ipython-input-14-c5a6e8b1d039>:9: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=data, x=col, palette="viridis")
No description has been provided for this image
In [15]:
# Pair plot for key numerical relationships, separated by churn status
sns.pairplot(data, hue='churn', palette="husl", diag_kind="kde")  # 'churn' is the target column
plt.suptitle("Pairwise Plot by Churn Status", y=1.02)
plt.show()
No description has been provided for this image
In [16]:
# Violin plot to compare distributions of app and desktop metrics by churn status
app_metrics = ['app sessions', 'app transactions']
desktop_metrics = ['desktop sessions', 'desktop transactions']

for col in app_metrics + desktop_metrics:
    plt.figure(figsize=(8, 5))
    sns.violinplot(data=data, x='churn', y=col, palette="muted")
    plt.title(f"{col} Distribution by Churn Status")
    plt.show()
<ipython-input-16-00ccb04ca475>:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data, x='churn', y=col, palette="muted")
No description has been provided for this image
<ipython-input-16-00ccb04ca475>:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data, x='churn', y=col, palette="muted")
No description has been provided for this image
<ipython-input-16-00ccb04ca475>:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data, x='churn', y=col, palette="muted")
No description has been provided for this image
<ipython-input-16-00ccb04ca475>:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data, x='churn', y=col, palette="muted")
No description has been provided for this image
In [17]:
import numpy as np
# Plotting the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(data.select_dtypes(include=[np.number]).corr(), annot=True, cmap="coolwarm", fmt=".3f")
plt.title("Correlation Matrix of Numerical Columns")
plt.show()
No description has been provided for this image
In [18]:
summary_stats = data.describe().T
summary_stats['skew'] = data.select_dtypes(include=[np.number]).skew()  # Skewness shows asymmetry of distribution

summary_stats
Out[18]:
count mean std min 25% 50% 75% max skew
account length 3333.0 1.010648e+02 39.822106 1.0 74.0 101.0 127.0 243.0 0.096606
location code 3333.0 4.371824e+02 42.371290 408.0 408.0 415.0 510.0 510.0 1.126823
user id 3333.0 3.746291e+06 274662.573752 3271058.0 3508680.0 3748187.0 3985970.0 4229964.0 0.009732
add to wishlist 3333.0 8.099010e+00 13.688365 0.0 0.0 0.0 20.0 51.0 1.264824
desktop sessions 3333.0 1.798119e+02 54.457135 0.0 144.0 179.0 216.0 351.0 -0.028737
app sessions 3333.0 3.056796e+01 9.269376 0.0 24.0 31.0 37.0 60.0 -0.028420
desktop transactions 3333.0 1.708761e+01 4.323795 0.0 14.0 17.0 20.0 31.0 -0.010819
total product detail views 3333.0 1.004356e+02 20.069084 0.0 87.0 101.0 114.0 165.0 -0.111787
session duration 3333.0 2.010396e+02 50.714359 0.0 167.0 201.0 235.0 364.0 -0.024248
promotion clicks 3333.0 1.001107e+02 19.923911 0.0 87.0 100.0 114.0 170.0 -0.055096
sale product views 3333.0 1.001077e+02 19.568609 33.0 87.0 100.0 113.0 175.0 0.032500
app transactions 3333.0 4.479448e+00 2.461214 0.0 3.0 4.0 6.0 20.0 1.321478
customer service calls 3333.0 1.562856e+00 1.315491 0.0 1.0 1.0 2.0 9.0 1.091359
churn 3333.0 1.449145e-01 0.352067 0.0 0.0 0.0 0.0 1.0 2.018356
In [19]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder on 'credit card info save' and 'push status'
data['credit card info save'] = label_encoder.fit_transform(data['credit card info save'])
data['push status'] = label_encoder.fit_transform(data['push status'])

# Display the first few rows to confirm the encoding
data.head()
Out[19]:
account length location code user id credit card info save push status add to wishlist desktop sessions app sessions desktop transactions total product detail views session duration promotion clicks avg order value sale product views discount rate per visited products product detail view per app session app transactions add to cart per session customer service calls churn
0 128 415 3824657 0 1 25 265 45 17 110 197 87 244,7 91 11,01 10 3 2,7 1 0
1 107 415 3717191 0 1 26 162 27 17 123 196 103 254,4 103 11,45 13,7 3 3,7 1 0
2 137 415 3581921 0 0 0 243 41 10 114 121 110 162,6 104 7,32 12,2 5 3,29 0 0
3 84 408 3759999 1 0 0 299 51 5 71 62 88 196,9 89 8,86 6,6 7 1,78 2 0
4 75 415 3306626 1 0 0 167 28 13 113 148 122 186,9 121 8,41 10,1 3 2,73 3 0
In [20]:
# Box plot for key activity metrics
columns_to_plot = ['desktop sessions', 'app sessions', 'desktop transactions',
                   'app transactions', 'session duration', 'promotion clicks']
for column in columns_to_plot:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data[column])
    plt.title(f'Box Plot of {column}')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [21]:
# Scatter plot of session duration vs. desktop transactions
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='session duration', y='desktop transactions', hue='churn')
plt.title('Session Duration vs. Desktop Transactions')
plt.xlabel('Session Duration')
plt.ylabel('Desktop Transactions')
plt.show()
No description has been provided for this image
In [22]:
# Pair plot of selected features
sns.pairplot(data[['desktop sessions', 'app sessions', 'promotion clicks', 'session duration', 'churn']], hue='churn')
plt.suptitle('Pair Plot of Selected Features (Desktop, App Sessions, Promotions, Churn)', y=1.02)
plt.show()
No description has been provided for this image
In [23]:
# Bar plot of average transactions by churn status
plt.figure(figsize=(8, 5))
sns.barplot(data=data, x='churn', y='desktop transactions', estimator='mean', ci=None)
plt.title('Average Desktop Transactions by Churn Status')
plt.xlabel('Churn')
plt.ylabel('Average Desktop Transactions')
plt.show()
<ipython-input-23-016eed8ca896>:3: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=data, x='churn', y='desktop transactions', estimator='mean', ci=None)
No description has been provided for this image
In [24]:
# Scatter plot for session duration vs. desktop and app transactions
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='session duration', y='desktop transactions', hue='churn')
plt.title('Relationship between Session Duration and Desktop Transactions')
plt.xlabel('Session Duration (minutes)')
plt.ylabel('Desktop Transactions')
plt.show()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='session duration', y='app transactions', hue='churn')
plt.title('Relationship between Session Duration and App Transactions')
plt.xlabel('Session Duration (minutes)')
plt.ylabel('App Transactions')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [25]:
# Scatter plot for wishlist additions vs. transactions
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='add to wishlist', y='desktop transactions', hue='churn')
plt.title('Wishlist Additions vs. Desktop Transactions')
plt.xlabel('Add to Wishlist Count')
plt.ylabel('Desktop Transactions')
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='add to wishlist', y='app transactions', hue='churn')
plt.title('Wishlist Additions vs. App Transactions')
plt.xlabel('Add to Wishlist Count')
plt.ylabel('App Transactions')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [26]:
# Scatter plot for promotion clicks vs. desktop and app transactions
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='promotion clicks', y='desktop transactions', hue='churn')
plt.title('Promotion Clicks vs. Desktop Transactions')
plt.xlabel('Promotion Clicks')
plt.ylabel('Desktop Transactions')
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='promotion clicks', y='app transactions', hue='churn')
plt.title('Promotion Clicks vs. App Transactions')
plt.xlabel('Promotion Clicks')
plt.ylabel('App Transactions')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [27]:
# Box plot of customer service calls by churn status
plt.figure(figsize=(8, 5))
sns.boxplot(data=data, x='churn', y='customer service calls')
plt.title('Customer Service Calls by Churn Status')
plt.xlabel('Churn Status')
plt.ylabel('Customer Service Calls')
plt.show()
No description has been provided for this image
In [31]:
data
Out[31]:
account length location code user id credit card info save push status add to wishlist desktop sessions app sessions desktop transactions total product detail views session duration promotion clicks avg order value sale product views discount rate per visited products product detail view per app session app transactions add to cart per session customer service calls churn
0 128 415 3824657 0 1 25 265 45 17 110 197 87 244,7 91 11,01 10 3 2,7 1 0
1 107 415 3717191 0 1 26 162 27 17 123 196 103 254,4 103 11,45 13,7 3 3,7 1 0
2 137 415 3581921 0 0 0 243 41 10 114 121 110 162,6 104 7,32 12,2 5 3,29 0 0
3 84 408 3759999 1 0 0 299 51 5 71 62 88 196,9 89 8,86 6,6 7 1,78 2 0
4 75 415 3306626 1 0 0 167 28 13 113 148 122 186,9 121 8,41 10,1 3 2,73 3 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3328 192 415 4144276 0 1 36 156 27 18 77 216 126 279,1 83 12,56 9,9 6 2,67 2 0
3329 68 415 3703271 0 0 0 231 39 13 57 153 55 191,3 123 8,61 9,6 4 2,59 3 0
3330 28 510 3288230 0 0 0 181 31 25 109 289 58 191,9 91 8,64 14,1 6 3,81 2 0
3331 184 510 3646381 1 0 0 214 36 14 105 160 84 139,2 137 6,26 5 10 1,35 2 0
3332 74 415 4004344 0 1 25 234 40 23 113 266 82 241,4 77 10,86 13,7 4 3,7 0 0

3333 rows × 20 columns

In [32]:
# Convert commas to periods
converted_data = data["discount rate per visited products"].replace(',', '.')

# Print the result
print(converted_data)
0       11,01
1       11,45
2        7,32
3        8,86
4        8,41
        ...  
3328    12,56
3329     8,61
3330     8,64
3331     6,26
3332    10,86
Name: discount rate per visited products, Length: 3333, dtype: object
In [32]:
 
In [33]:
data
Out[33]:
account length location code user id credit card info save push status add to wishlist desktop sessions app sessions desktop transactions total product detail views session duration promotion clicks avg order value sale product views discount rate per visited products product detail view per app session app transactions add to cart per session customer service calls churn
0 128 415 3824657 0 1 25 265 45 17 110 197 87 244,7 91 11,01 10 3 2,7 1 0
1 107 415 3717191 0 1 26 162 27 17 123 196 103 254,4 103 11,45 13,7 3 3,7 1 0
2 137 415 3581921 0 0 0 243 41 10 114 121 110 162,6 104 7,32 12,2 5 3,29 0 0
3 84 408 3759999 1 0 0 299 51 5 71 62 88 196,9 89 8,86 6,6 7 1,78 2 0
4 75 415 3306626 1 0 0 167 28 13 113 148 122 186,9 121 8,41 10,1 3 2,73 3 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3328 192 415 4144276 0 1 36 156 27 18 77 216 126 279,1 83 12,56 9,9 6 2,67 2 0
3329 68 415 3703271 0 0 0 231 39 13 57 153 55 191,3 123 8,61 9,6 4 2,59 3 0
3330 28 510 3288230 0 0 0 181 31 25 109 289 58 191,9 91 8,64 14,1 6 3,81 2 0
3331 184 510 3646381 1 0 0 214 36 14 105 160 84 139,2 137 6,26 5 10 1,35 2 0
3332 74 415 4004344 0 1 25 234 40 23 113 266 82 241,4 77 10,86 13,7 4 3,7 0 0

3333 rows × 20 columns

In [34]:
data.to_csv('data_new.csv', index=False)
In [35]:
# Remove all commas from the data
df = data.applymap(lambda x: str(x).replace(',', '') if isinstance(x, str) else x)
<ipython-input-35-5210660e964e>:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  df = data.applymap(lambda x: str(x).replace(',', '') if isinstance(x, str) else x)
In [36]:
df
Out[36]:
account length location code user id credit card info save push status add to wishlist desktop sessions app sessions desktop transactions total product detail views session duration promotion clicks avg order value sale product views discount rate per visited products product detail view per app session app transactions add to cart per session customer service calls churn
0 128 415 3824657 0 1 25 265 45 17 110 197 87 2447 91 1101 10 3 27 1 0
1 107 415 3717191 0 1 26 162 27 17 123 196 103 2544 103 1145 137 3 37 1 0
2 137 415 3581921 0 0 0 243 41 10 114 121 110 1626 104 732 122 5 329 0 0
3 84 408 3759999 1 0 0 299 51 5 71 62 88 1969 89 886 66 7 178 2 0
4 75 415 3306626 1 0 0 167 28 13 113 148 122 1869 121 841 101 3 273 3 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3328 192 415 4144276 0 1 36 156 27 18 77 216 126 2791 83 1256 99 6 267 2 0
3329 68 415 3703271 0 0 0 231 39 13 57 153 55 1913 123 861 96 4 259 3 0
3330 28 510 3288230 0 0 0 181 31 25 109 289 58 1919 91 864 141 6 381 2 0
3331 184 510 3646381 1 0 0 214 36 14 105 160 84 1392 137 626 5 10 135 2 0
3332 74 415 4004344 0 1 25 234 40 23 113 266 82 2414 77 1086 137 4 37 0 0

3333 rows × 20 columns

In [38]:
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   account length                       3333 non-null   int64 
 1   location code                        3333 non-null   int64 
 2   user id                              3333 non-null   int64 
 3   credit card info save                3333 non-null   int64 
 4   push status                          3333 non-null   int64 
 5   add to wishlist                      3333 non-null   int64 
 6   desktop sessions                     3333 non-null   int64 
 7   app sessions                         3333 non-null   int64 
 8   desktop transactions                 3333 non-null   int64 
 9   total product detail views           3333 non-null   int64 
 10  session duration                     3333 non-null   int64 
 11  promotion clicks                     3333 non-null   int64 
 12  avg order value                      3333 non-null   object
 13  sale product views                   3333 non-null   int64 
 14  discount rate per visited products   3333 non-null   object
 15  product detail view per app session  3333 non-null   object
 16  app transactions                     3333 non-null   int64 
 17  add to cart per session              3333 non-null   object
 18  customer service calls               3333 non-null   int64 
 19  churn                                3333 non-null   int64 
dtypes: int64(16), object(4)
memory usage: 520.9+ KB
In [39]:
# Convert columns with numeric strings to integers
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='coerce')
In [40]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   account length                       3333 non-null   int64
 1   location code                        3333 non-null   int64
 2   user id                              3333 non-null   int64
 3   credit card info save                3333 non-null   int64
 4   push status                          3333 non-null   int64
 5   add to wishlist                      3333 non-null   int64
 6   desktop sessions                     3333 non-null   int64
 7   app sessions                         3333 non-null   int64
 8   desktop transactions                 3333 non-null   int64
 9   total product detail views           3333 non-null   int64
 10  session duration                     3333 non-null   int64
 11  promotion clicks                     3333 non-null   int64
 12  avg order value                      3333 non-null   int64
 13  sale product views                   3333 non-null   int64
 14  discount rate per visited products   3333 non-null   int64
 15  product detail view per app session  3333 non-null   int64
 16  app transactions                     3333 non-null   int64
 17  add to cart per session              3333 non-null   int64
 18  customer service calls               3333 non-null   int64
 19  churn                                3333 non-null   int64
dtypes: int64(20)
memory usage: 520.9 KB
In [41]:
# Define features and target variable
X = df.drop(columns=['churn', 'user id'])  # Drop target and ID columns
y = df['churn']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
Out[41]:
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [42]:
# Predict and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       857
           1       0.96      0.68      0.80       143

    accuracy                           0.95      1000
   macro avg       0.95      0.84      0.88      1000
weighted avg       0.95      0.95      0.95      1000

In [ ]: